% !Mode:: "TeX:UTF-8"

% 01-Introduction
% Textbook on speech recognition. Publisher name and city were fused into one
% field and are now split into publisher/address.
% NOTE(review): the third author and volume 14 look like citation-export
% artefacts -- confirm against the book's title page.
@book{rabiner1993fundamentals,
  author    = {Rabiner, Lawrence R and Juang, Biing-Hwang and Rutledge, Janet C},
  title     = {Fundamentals of speech recognition},
  volume    = {14},
  year      = {1993},
  publisher = {PTR Prentice Hall},
  address   = {Englewood Cliffs}
}

% Graves (2012), arXiv preprint on sequence transduction with RNNs.
@article{graves2012sequence,
  author  = {Graves, Alex},
  title   = {Sequence transduction with recurrent neural networks},
  journal = {arXiv preprint arXiv:1211.3711},
  year    = {2012}
}

% Yu and Deng, Springer book. The title was stored in all capitals with a
% trailing period (which yields a doubled period in the bibliography); it is
% now stored in normal title case with the book's subtitle.
@book{yu2016automatic,
  author    = {Yu, Dong and Deng, Li},
  title     = {Automatic Speech Recognition: A Deep Learning Approach},
  year      = {2016},
  publisher = {Springer}
}

% Chung et al. (2014), arXiv preprint evaluating gated recurrent networks.
@article{chung2014empirical,
  author  = {Chung, Junyoung and Gulcehre, Caglar and Cho, KyungHyun and Bengio, Yoshua},
  title   = {Empirical evaluation of gated recurrent neural networks on sequence modeling},
  journal = {arXiv preprint arXiv:1412.3555},
  year    = {2014}
}

% Highway LSTM paper, ICASSP 2016. Fixes the author typo "Yaco" (Kaisheng Yao,
% spelled "Yao" elsewhere in this file) and protects the acronym in the title.
@inproceedings{zhang2016highway,
  author       = {Zhang, Yu and Chen, Guoguo and Yu, Dong and Yao, Kaisheng and Khudanpur, Sanjeev and Glass, James},
  title        = {Highway long short-term memory {RNNs} for distant speech recognition},
  booktitle    = {2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5755--5759},
  year         = {2016},
  organization = {IEEE}
}

% Residual LSTM arXiv preprint. The acronym is brace-protected so that
% sentence-casing styles do not lower-case it.
@article{kim2017residual,
  author  = {Kim, Jaeyoung and El-Khamy, Mostafa and Lee, Jungwon},
  title   = {Residual {LSTM}: Design of a deep recurrent architecture for distant speech recognition},
  journal = {arXiv preprint arXiv:1701.03360},
  year    = {2017}
}

% Kalchbrenner, Danihelka and Graves (2015), Grid LSTM arXiv preprint.
@article{kalchbrenner2015grid,
  author  = {Kalchbrenner, Nal and Danihelka, Ivo and Graves, Alex},
  title   = {Grid long short-term memory},
  journal = {arXiv preprint arXiv:1507.01526},
  year    = {2015}
}

% Bidirectional LSTM phoneme classification, Neural Networks 18(5-6).
% The acronym is protected and the issue range uses an en-dash (double hyphen).
@article{graves2005framewise,
  author    = {Graves, Alex and Schmidhuber, J{\"u}rgen},
  title     = {Framewise phoneme classification with bidirectional {LSTM} and other neural network architectures},
  journal   = {Neural Networks},
  volume    = {18},
  number    = {5--6},
  pages     = {602--610},
  year      = {2005},
  publisher = {Elsevier}
}

% Latency-controlled BLSTM acoustic models, ICASSP 2017. Acronym protected
% against sentence-casing.
@inproceedings{xue2017improving,
  author       = {Xue, Shaofei and Yan, Zhijie},
  title        = {Improving latency-controlled {BLSTM} acoustic models for online speech recognition},
  booktitle    = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5340--5344},
  year         = {2017},
  organization = {IEEE}
}

% Sainath et al., ICASSP 2015: combined convolutional, LSTM and fully
% connected network architecture.
@inproceedings{sainath2015convolutional,
  author       = {Sainath, Tara N and Vinyals, Oriol and Senior, Andrew and Sak, Ha{\c{s}}im},
  title        = {Convolutional, long short-term memory, fully connected deep neural networks},
  booktitle    = {2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4580--4584},
  year         = {2015},
  organization = {IEEE}
}

% Time-delay neural network paper. The journal name is capitalised the same
% way as the other "Neural Networks" entry spelled in title case in this file.
@article{lang1990time,
  author    = {Lang, Kevin J and Waibel, Alex H and Hinton, Geoffrey E},
  title     = {A time-delay neural network architecture for isolated word recognition},
  journal   = {Neural Networks},
  volume    = {3},
  number    = {1},
  pages     = {23--43},
  year      = {1990},
  publisher = {Elsevier}
}

% Very deep multilingual CNNs, ICASSP 2016. Acronym protected against
% sentence-casing.
@inproceedings{sercu2016very,
  author       = {Sercu, Tom and Puhrsch, Christian and Kingsbury, Brian and LeCun, Yann},
  title        = {Very deep multilingual convolutional neural networks for {LVCSR}},
  booktitle    = {2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4955--4959},
  year         = {2016},
  organization = {IEEE}
}

% Sercu and Goel (2016), arXiv preprint on time-dilated convolutions.
@article{sercu2016dense,
  author  = {Sercu, Tom and Goel, Vaibhava},
  title   = {Dense prediction on sequences with time-dilated convolutions for speech recognition},
  journal = {arXiv preprint arXiv:1611.09288},
  year    = {2016}
}

% Raw-waveform CLDNN front-end paper (ISCA annual conference, 2015).
% Acronym protected against sentence-casing.
@inproceedings{sainath2015learning,
  author    = {Sainath, Tara N and Weiss, Ron J and Senior, Andrew and Wilson, Kevin W and Vinyals, Oriol},
  title     = {Learning the speech front-end with raw waveform {CLDNNs}},
  booktitle = {Sixteenth Annual Conference of the International Speech Communication Association},
  year      = {2015}
}

% CTC paper, ICML 2006. Styles print booktitle verbatim, so the proceedings
% name is stored in proper title case.
@inproceedings{graves2006connectionist,
  author       = {Graves, Alex and Fern{\'a}ndez, Santiago and Gomez, Faustino and Schmidhuber, J{\"u}rgen},
  title        = {Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks},
  booktitle    = {Proceedings of the 23rd International Conference on Machine Learning},
  pages        = {369--376},
  year         = {2006},
  organization = {ACM}
}

% GAN paper, NIPS 2014. The proceedings name is capitalised the same way as
% the other NIPS entries in this file.
@inproceedings{goodfellow2014generative,
  author    = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  title     = {Generative adversarial nets},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2672--2680},
  year      = {2014}
}

% Li et al. (2017), arXiv preprint on teacher-student domain adaptation.
@article{li2017large,
  author  = {Li, Jinyu and Seltzer, Michael L and Wang, Xi and Zhao, Rui and Gong, Yifan},
  title   = {Large-scale domain adaptation via teacher-student learning},
  journal = {arXiv preprint arXiv:1708.05466},
  year    = {2017}
}

% Hinton, Vinyals and Dean (2015), knowledge distillation arXiv preprint.
@article{hinton2015distilling,
  author  = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  title   = {Distilling the knowledge in a neural network},
  journal = {arXiv preprint arXiv:1503.02531},
  year    = {2015}
}

% SVD restructuring of DNN acoustic models, Interspeech 2013. The trailing
% period was removed from the title: styles add their own punctuation, so it
% produced a doubled period.
@inproceedings{xue2013restructuring,
  author    = {Xue, Jian and Li, Jinyu and Gong, Yifan},
  title     = {Restructuring of deep neural network acoustic models with singular value decomposition},
  booktitle = {Interspeech},
  pages     = {2365--2369},
  year      = {2013}
}

% Workshop paper. It was typed as an article with no journal (a required
% field), so it is now an inproceedings entry with its workshop as booktitle.
% The acronym in the title is protected against sentence-casing.
@inproceedings{vanhoucke2011improving,
  author    = {Vanhoucke, Vincent and Senior, Andrew and Mao, Mark Z},
  title     = {Improving the speed of neural networks on {CPUs}},
  booktitle = {Deep Learning and Unsupervised Feature Learning Workshop, NIPS 2011},
  year      = {2011}
}

% LSTM architectures for large-scale acoustic modeling (ISCA annual
% conference, 2014). The conference name is capitalised like the
% "Sixteenth Annual Conference ..." entry in this file.
@inproceedings{sak2014long,
  author    = {Sak, Ha{\c{s}}im and Senior, Andrew and Beaufays, Fran{\c{c}}oise},
  title     = {Long short-term memory recurrent neural network architectures for large scale acoustic modeling},
  booktitle = {Fifteenth Annual Conference of the International Speech Communication Association},
  year      = {2014}
}

% Vanhoucke, Devin and Heigold, ICASSP 2013: multiframe DNN acoustic models.
@inproceedings{vanhoucke2013multiframe,
  author       = {Vanhoucke, Vincent and Devin, Matthieu and Heigold, Georg},
  title        = {Multiframe deep neural networks for acoustic modeling},
  booktitle    = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing},
  pages        = {7582--7585},
  year         = {2013},
  organization = {IEEE}
}

% Kaldi toolkit paper. It appeared at the IEEE ASRU 2011 workshop, so it is an
% inproceedings entry rather than a technical report; the IEEE Signal
% Processing Society is the publisher, not a report-issuing institution.
% The toolkit name is protected against sentence-casing.
@inproceedings{povey2011kaldi,
  author    = {Povey, Daniel and Ghoshal, Arnab and Boulianne, Gilles and Burget, Lukas and Glembek, Ondrej and Goel, Nagendra and Hannemann, Mirko and Motlicek, Petr and Qian, Yanmin and Schwarz, Petr and others},
  title     = {The {Kaldi} speech recognition toolkit},
  booktitle = {IEEE 2011 Workshop on Automatic Speech Recognition and Understanding},
  year      = {2011},
  publisher = {IEEE Signal Processing Society}
}

% PyTorch autodiff workshop paper. It was typed as an article with no journal
% (a required field), so it is now an inproceedings entry with its workshop as
% booktitle. The product name is written with its real casing and protected.
@inproceedings{paszke2017automatic,
  author    = {Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam},
  title     = {Automatic differentiation in {PyTorch}},
  booktitle = {NIPS 2017 Autodiff Workshop},
  year      = {2017}
}


% 02-GMM-HMM
% Hori and Nakamura (2013), WFST-based speech recognition algorithms.
@article{hori2013speech,
  author    = {Hori, Takaaki and Nakamura, Atsushi},
  title     = {Speech recognition algorithms using weighted finite-state transducers},
  journal   = {Synthesis Lectures on Speech and Audio Processing},
  volume    = {9},
  number    = {1},
  pages     = {1--162},
  year      = {2013},
  publisher = {Morgan \& Claypool Publishers}
}

% Jelinek, Bahl and Mercer (1975), statistical decoder for continuous speech.
@article{jelinek1975design,
  author    = {Jelinek, Frederick and Bahl, Lalit and Mercer, Robert},
  title     = {Design of a linguistic statistical decoder for the recognition of continuous speech},
  journal   = {IEEE Transactions on Information Theory},
  volume    = {21},
  number    = {3},
  pages     = {250--256},
  year      = {1975},
  publisher = {IEEE}
}

% Jelinek's 1997 book. The publisher is spelled "MIT Press", matching the
% other MIT Press entry in this file.
@book{jelinek1997statistical,
  author    = {Jelinek, Frederick},
  title     = {Statistical methods for speech recognition},
  year      = {1997},
  publisher = {MIT Press}
}

% Furui (1986), dynamic spectral features for isolated word recognition.
@article{furui1986speaker,
  author    = {Furui, Sadaoki},
  title     = {Speaker-independent isolated word recognition using dynamic features of speech spectrum},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {34},
  number    = {1},
  pages     = {52--59},
  year      = {1986},
  publisher = {IEEE}
}

% HDA paper. The acronym is protected against sentence-casing and the journal
% name is stored in title case.
@article{kumar1998heteroscedastic,
  author    = {Kumar, Nagendra and Andreou, Andreas G},
  title     = {Heteroscedastic discriminant analysis and reduced rank {HMMs} for improved speech recognition},
  journal   = {Speech Communication},
  volume    = {26},
  number    = {4},
  pages     = {283--297},
  year      = {1998},
  publisher = {Elsevier}
}

% Tandem features paper, ICASSP 2000. The catalogue-number residue was removed
% from booktitle, the glued initials "PW" were separated so they parse as two
% initials, and the acronym in the title is protected.
@inproceedings{hermansky2000tandem,
  author       = {Hermansky, Hynek and Ellis, Daniel P W and Sharma, Sangita},
  title        = {Tandem connectionist feature extraction for conventional {HMM} systems},
  booktitle    = {2000 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  volume       = {3},
  pages        = {1635--1638},
  year         = {2000},
  organization = {IEEE}
}

% Spoken Language Processing textbook. Publisher name and city were fused into
% one field and are now split into publisher/address.
% NOTE(review): volume 1 looks like a citation-export artefact -- confirm.
@book{huang2001spoken,
  author    = {Huang, Xuedong and Acero, Alex and Hon, Hsiao-Wuen and Reddy, Raj},
  title     = {Spoken language processing: A guide to theory, algorithm, and system development},
  volume    = {1},
  year      = {2001},
  publisher = {Prentice Hall PTR},
  address   = {Upper Saddle River}
}

% HMM tutorial, Proceedings of the IEEE 77(2). Fixes the publisher spelling
% "Ieee" and protects the proper noun in the title.
@article{rabiner1989tutorial,
  author    = {Rabiner, Lawrence R},
  title     = {A tutorial on hidden {Markov} models and selected applications in speech recognition},
  journal   = {Proceedings of the IEEE},
  volume    = {77},
  number    = {2},
  pages     = {257--286},
  year      = {1989},
  publisher = {IEEE}
}

% MPE / I-smoothing paper, ICASSP 2002, volume 1. The proceedings paginate
% this volume as I-105 to I-108; the exported "I--105" was a mangled range.
% The capital in "I-smoothing" is protected against sentence-casing.
@inproceedings{povey2002minimum,
  author       = {Povey, Daniel and Woodland, Philip C},
  title        = {Minimum phone error and {I-smoothing} for improved discriminative training},
  booktitle    = {2002 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  volume       = {1},
  pages        = {I-105--I-108},
  year         = {2002},
  organization = {IEEE}
}

% Bellman and Dreyfus, 2015 reissue. The publisher name is stored in proper
% title case.
% NOTE(review): volume 2050 is presumably a reprint-series number, not a
% volume of this work -- confirm and move to a series field if so.
@book{bellman2015applied,
  author    = {Bellman, Richard E and Dreyfus, Stuart E},
  title     = {Applied dynamic programming},
  volume    = {2050},
  year      = {2015},
  publisher = {Princeton University Press}
}

% Kai-Fu Lee's doctoral thesis. It was stored as an article with
% journal "Ph. D. Thesis" and the university as publisher; it is now a
% phdthesis entry with the university in the school field. The system name is
% protected against sentence-casing.
@phdthesis{lee1988large,
  author = {Lee, Kai-Fu},
  title  = {Large-vocabulary speaker-independent continuous speech recognition: The {SPHINX} system},
  school = {Carnegie Mellon University},
  year   = {1988}
}

% Decision trees for phonological rules, ICASSP 1991. The "[Proceedings]"
% export residue was removed from booktitle, and glued initials ("PS", "MA")
% were separated so each parses as its own initial.
@inproceedings{bahl1991decision,
  author       = {Bahl, Lalit R and Gopalakrishnan, P S and Nahamoo, D and Picheny, M A and others},
  title        = {Decision trees for phonological rules in continuous speech},
  booktitle    = {ICASSP 91: 1991 International Conference on Acoustics, Speech, and Signal Processing},
  pages        = {185--188},
  year         = {1991},
  organization = {IEEE}
}

% Tree-based state tying paper (HLT workshop, 1994). Styles print booktitle
% verbatim, so the proceedings name is stored in proper title case.
@inproceedings{young1994tree,
  author       = {Young, Steve J and Odell, Julian J and Woodland, Philip C},
  title        = {Tree-based state tying for high accuracy acoustic modelling},
  booktitle    = {Proceedings of the Workshop on Human Language Technology},
  pages        = {307--312},
  year         = {1994},
  organization = {Association for Computational Linguistics}
}

% MDL-based subword modeling paper. The leading acronym is protected: sentence
% casing would otherwise render it as "Mdl-based".
@article{shinoda2000mdl,
  author    = {Shinoda, Koichi and Watanabe, Takao},
  title     = {{MDL}-based context-dependent subword modeling for speech recognition},
  journal   = {Acoustical Science and Technology},
  volume    = {21},
  number    = {2},
  pages     = {79--86},
  year      = {2000},
  publisher = {Acoustical Society of Japan}
}

% Variational Bayesian estimation paper. The proper adjective is protected
% against sentence-casing.
@article{watanabe2004variational,
  author    = {Watanabe, Shinji and Minami, Yasuhiro and Nakamura, Atsushi and Ueda, Naonori},
  title     = {Variational {Bayesian} estimation and clustering for speech recognition},
  journal   = {IEEE Transactions on Speech and Audio Processing},
  volume    = {12},
  number    = {4},
  pages     = {365--381},
  year      = {2004},
  publisher = {IEEE}
}

% Katz back-off smoothing paper. The journal name is spelled exactly like the
% other entry from the same journal in this file, so one journal does not
% appear with two capitalisations.
@article{katz1987estimation,
  author    = {Katz, Slava},
  title     = {Estimation of probabilities from sparse data for the language model component of a speech recognizer},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {35},
  number    = {3},
  pages     = {400--401},
  year      = {1987},
  publisher = {IEEE}
}

% Good (1953), Biometrika 40(3-4). The issue range uses an en-dash
% (double hyphen) rather than a single hyphen.
@article{good1953population,
  author    = {Good, Irving J},
  title     = {The population frequencies of species and the estimation of population parameters},
  journal   = {Biometrika},
  volume    = {40},
  number    = {3--4},
  pages     = {237--264},
  year      = {1953},
  publisher = {Oxford University Press}
}




% 03-DNN-HMM
% CD-DNN-HMM paper. The journal name is spelled exactly like the other entry
% from the same journal in this file.
@article{dahl2012context,
  author    = {Dahl, George E and Yu, Dong and Deng, Li and Acero, Alex},
  title     = {Context-dependent pre-trained deep neural networks for large-vocabulary speech recognition},
  journal   = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume    = {20},
  number    = {1},
  pages     = {30--42},
  year      = {2012},
  publisher = {IEEE}
}



% 04-LSTM-Variants
% "Four research groups" overview, IEEE Signal Processing Magazine 29(6).
% The entry lacked issue number, pages and the subtitle, and its truncated
% author list skipped Sainath before Kingsbury; the complete author list is
% given instead of "and others".
@article{hinton2012deep,
  author    = {Hinton, Geoffrey and Deng, Li and Yu, Dong and Dahl, George E and Mohamed, Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Sainath, Tara N and Kingsbury, Brian},
  title     = {Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups},
  journal   = {IEEE Signal Processing Magazine},
  volume    = {29},
  number    = {6},
  pages     = {82--97},
  year      = {2012},
  publisher = {IEEE}
}

% Long-term dependency paper. It has exactly three authors, so the exported
% "and others" (which renders a spurious "et al.") was removed; the journal
% name is stored in title case.
@article{bengio1994learning,
  author  = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
  title   = {Learning long-term dependencies with gradient descent is difficult},
  journal = {IEEE Transactions on Neural Networks},
  volume  = {5},
  number  = {2},
  pages   = {157--166},
  year    = {1994}
}

% Boden (2002), guide to recurrent networks and backpropagation.
% NOTE(review): "the Dallas project" is probably a report collection rather
% than a journal -- confirm the entry type.
@article{boden2002guide,
  author  = {Boden, Mikael},
  title   = {A guide to recurrent neural networks and backpropagation},
  journal = {the Dallas project},
  year    = {2002}
}

% Jaeger's tutorial is a GMD technical report (no. 159), not a book, so it is
% a techreport entry with institution/number. The stray straight quotes
% around "echo state network" are now proper TeX quotes and the acronyms are
% protected against sentence-casing ("BPPT" is kept as given in the source).
@techreport{jaeger2002tutorial,
  author      = {Jaeger, Herbert},
  title       = {Tutorial on training recurrent neural networks, covering {BPPT}, {RTRL}, {EKF} and the ``echo state network'' approach},
  type        = {{GMD} Report},
  number      = {159},
  institution = {GMD-Forschungszentrum Informationstechnik},
  address     = {Bonn},
  year        = {2002}
}

% Back-propagation paper (1988 reprint data as exported). It has exactly three
% authors, so the exported "and others" (which renders a spurious "et al.")
% was removed.
% NOTE(review): venue, volume and pages describe a reprint collection --
% confirm or cite the original journal publication.
@article{rumelhart1988learning,
  author  = {Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J},
  title   = {Learning representations by back-propagating errors},
  journal = {Cognitive modeling},
  volume  = {5},
  number  = {3},
  pages   = {1},
  year    = {1988}
}

% Deep RNN speech recognition, ICASSP 2013. The conference name is capitalised
% like the other ICASSP entries in this file (booktitle is printed verbatim).
@inproceedings{graves2013speech,
  author       = {Graves, Alex and Mohamed, Abdel-rahman and Hinton, Geoffrey},
  title        = {Speech recognition with deep recurrent neural networks},
  booktitle    = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing},
  pages        = {6645--6649},
  year         = {2013},
  organization = {IEEE}
}

% Original LSTM paper, Neural Computation 9(8). The journal name is stored in
% title case because styles print it verbatim.
@article{hochreiter1997long,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural Computation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  year      = {1997},
  publisher = {MIT Press}
}

% Forget-gate paper. It was typed as an article with no journal (a required
% field); it appeared in the ICANN 1999 proceedings published by the IET, so
% it is now an inproceedings entry. The acronym in the title is protected.
% NOTE(review): booktitle wording and pages added from the published
% proceedings -- confirm.
@inproceedings{gers1999learning,
  author    = {Gers, Felix A and Schmidhuber, J{\"u}rgen and Cummins, Fred},
  title     = {Learning to forget: Continual prediction with {LSTM}},
  booktitle = {9th International Conference on Artificial Neural Networks (ICANN '99)},
  pages     = {850--855},
  year      = {1999},
  publisher = {IET}
}

% LSTM precise-timing paper, JMLR volume 3. "Aug" is a month, not an issue
% number, so it moved to the month field as the standard macro. The acronym is
% protected and the journal name is stored in title case.
@article{gers2002learning,
  author  = {Gers, Felix A and Schraudolph, Nicol N and Schmidhuber, J{\"u}rgen},
  title   = {Learning precise timing with {LSTM} recurrent networks},
  journal = {Journal of Machine Learning Research},
  volume  = {3},
  pages   = {115--143},
  month   = aug,
  year    = {2002}
}

% Graves (2013), arXiv preprint on sequence generation with RNNs.
@article{graves2013generating,
  author  = {Graves, Alex},
  title   = {Generating sequences with recurrent neural networks},
  journal = {arXiv preprint arXiv:1308.0850},
  year    = {2013}
}

% Deep learning overview, Neural Networks 61. The journal name is capitalised
% like the title-case "Neural Networks" entry in this file.
@article{schmidhuber2015deep,
  author    = {Schmidhuber, J{\"u}rgen},
  title     = {Deep learning in neural networks: An overview},
  journal   = {Neural Networks},
  volume    = {61},
  pages     = {85--117},
  year      = {2015},
  publisher = {Elsevier}
}

% (duplicate removed) The key sak2014long is already defined earlier in this
% file with the same data; a second definition makes BibTeX report
% "Repeated entry".

% Sequence-discriminative LSTM training (ISCA annual conference, 2014). The
% conference name is capitalised like the "Sixteenth Annual Conference ..."
% entry in this file.
@inproceedings{sak2014sequence,
  author    = {Sak, Ha{\c{s}}im and Vinyals, Oriol and Heigold, Georg and Senior, Andrew and McDermott, Erik and Monga, Rajat and Mao, Mark},
  title     = {Sequence discriminative distributed training of long short-term memory recurrent neural networks},
  booktitle = {Fifteenth Annual Conference of the International Speech Communication Association},
  year      = {2014}
}

% SRU arXiv preprint. The exported title had the acronyms lower-cased; they
% are restored and protected against sentence-casing.
@article{lei2017training,
  author  = {Lei, Tao and Zhang, Yu and Artzi, Yoav},
  title   = {Training {RNNs} as fast as {CNNs}},
  journal = {arXiv preprint arXiv:1709.02755},
  year    = {2017}
}

% Time-frequency LSTM paper, ASRU 2015. The leading acronym is protected:
% sentence casing would otherwise render it as "Lstm".
@inproceedings{li2015lstm,
  author       = {Li, Jinyu and Mohamed, Abdelrahman and Zweig, Geoffrey and Gong, Yifan},
  title        = {{LSTM} time and frequency recurrence for automatic speech recognition},
  booktitle    = {2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  pages        = {187--191},
  year         = {2015},
  organization = {IEEE}
}

% Multidimensional LSTM paper, ICASSP 2016. Acronyms protected against
% sentence-casing.
@inproceedings{li2016exploring,
  author       = {Li, Jinyu and Mohamed, Abdelrahman and Zweig, Geoffrey and Gong, Yifan},
  title        = {Exploring multidimensional {LSTMs} for large vocabulary {ASR}},
  booktitle    = {2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4940--4944},
  year         = {2016},
  organization = {IEEE}
}

% Deep belief network analysis paper. The exported journal "neural networks"
% and pages 6--9 were scraping residue; the paper appeared at ICASSP 2012, so
% it is now an inproceedings entry.
% NOTE(review): venue and pages corrected from the published proceedings --
% confirm.
@inproceedings{mohamed2012understanding,
  author       = {Mohamed, Abdel-rahman and Hinton, Geoffrey and Penn, Gerald},
  title        = {Understanding how deep belief networks perform acoustic modelling},
  booktitle    = {2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4273--4276},
  year         = {2012},
  organization = {IEEE}
}

% Mixed-bandwidth training paper, SLT 2012. The model acronym is protected
% against sentence-casing.
@inproceedings{li2012improving,
  author       = {Li, Jinyu and Yu, Dong and Huang, Jui-Ting and Gong, Yifan},
  title        = {Improving wideband speech recognition using mixed-bandwidth training data in {CD-DNN-HMM}},
  booktitle    = {2012 IEEE Spoken Language Technology Workshop (SLT)},
  pages        = {131--136},
  year         = {2012},
  organization = {IEEE}
}

% Time-frequency LSTM vs. convolutional comparison. It was typed as an article
% with no journal (a required field); it is a conference paper, so it is now
% an inproceedings entry. Acronyms are protected against sentence-casing.
% NOTE(review): venue taken to be Interspeech 2016 -- confirm and add pages.
@inproceedings{sainath2016modeling,
  author    = {Sainath, Tara N and Li, Bo},
  title     = {Modeling time-frequency patterns with {LSTM} vs. convolutional architectures for {LVCSR} tasks},
  booktitle = {Interspeech},
  year      = {2016}
}

% ResNet paper, CVPR 2016. Styles print booktitle verbatim, so the
% proceedings name is stored in proper title case.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

% Zhang et al. (2015), first FSMN arXiv preprint.
@article{zhang2015feedforward,
  author  = {Zhang, Shiliang and Jiang, Hui and Wei, Si and Dai, Lirong},
  title   = {Feedforward sequential memory neural networks without recurrent feedback},
  journal = {arXiv preprint arXiv:1510.02693},
  year    = {2015}
}

% Zhang et al. (2015), second FSMN arXiv preprint (long-term dependency).
@article{zhang2015feedforward2,
  author  = {Zhang, Shiliang and Liu, Cong and Jiang, Hui and Wei, Si and Dai, Lirong and Hu, Yu},
  title   = {Feedforward sequential memory networks: A new structure to learn long-term dependency},
  journal = {arXiv preprint arXiv:1512.08301},
  year    = {2015}
}

% Compact FSMN paper, Interspeech 2016. The trailing period was removed from
% the title: styles add their own punctuation, so it produced a doubled period.
@inproceedings{zhang2016compact,
  author    = {Zhang, Shiliang and Jiang, Hui and Xiong, Shifu and Wei, Si and Dai, Li-Rong},
  title     = {Compact Feedforward Sequential Memory Networks for Large Vocabulary Continuous Speech Recognition},
  booktitle = {Interspeech},
  pages     = {3389--3393},
  year      = {2016}
}

% Deep-FSMN paper, ICASSP 2018. The model name is protected: sentence casing
% would otherwise render it as "Deep-fsmn".
@inproceedings{zhang2018deep,
  author       = {Zhang, Shiliang and Lei, Ming and Yan, Zhijie and Dai, Lirong},
  title        = {{Deep-FSMN} for large vocabulary continuous speech recognition},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5869--5873},
  year         = {2018},
  organization = {IEEE}
}

% WaveNet arXiv preprint. In "Oord, Aaron van den" BibTeX parses "van den" as
% part of the given name; the "von Last, First" form makes "van den" the
% surname particle. The model name is written with its real casing and
% protected.
@article{oord2016wavenet,
  author  = {van den Oord, A{\"a}ron and Dieleman, Sander and Zen, Heiga and Simonyan, Karen and Vinyals, Oriol and Graves, Alex and Kalchbrenner, Nal and Senior, Andrew and Kavukcuoglu, Koray},
  title   = {{WaveNet}: A generative model for raw audio},
  journal = {arXiv preprint arXiv:1609.03499},
  year    = {2016}
}



% 05-LSTM-Variants-Experiment
% AIShell-1 corpus paper, O-COCOSDA 2017. The corpus name and language name
% are protected against sentence-casing.
@inproceedings{bu2017aishell,
  author       = {Bu, Hui and Du, Jiayu and Na, Xingyu and Wu, Bengu and Zheng, Hao},
  title        = {{AIShell-1}: An open-source {Mandarin} speech corpus and a speech recognition baseline},
  booktitle    = {2017 20th Conference of the Oriental Chapter of the International Coordinating Committee on Speech Databases and Speech I/O Systems and Assessment (O-COCOSDA)},
  pages        = {1--5},
  year         = {2017},
  organization = {IEEE}
}

% AISHELL-2 arXiv preprint. The corpus name, language name and acronym are
% protected against sentence-casing.
@article{du2018aishell,
  author  = {Du, Jiayu and Na, Xingyu and Liu, Xuechen and Bu, Hui},
  title   = {{AISHELL-2}: Transforming {Mandarin} {ASR} Research Into Industrial Scale},
  journal = {arXiv preprint arXiv:1808.10583},
  year    = {2018}
}

% Reverberant data augmentation paper, ICASSP 2017. The booktitle was in the
% inverted catalogue form with the publisher appended ("... Conference on.
% IEEE"); it now matches the other ICASSP 2017 entry in this file, with IEEE
% in its own field.
@inproceedings{ko2017study,
  author       = {Ko, Tom and Peddinti, Vijayaditya and Povey, Daniel and Seltzer, Michael L and Khudanpur, Sanjeev},
  title        = {A study on data augmentation of reverberant speech for robust speech recognition},
  booktitle    = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5220--5224},
  year         = {2017},
  organization = {IEEE}
}



% 06-End-to-End-ASR
% Chan et al., ICASSP 2016: Listen, Attend and Spell.
@inproceedings{chan2016listen,
  author       = {Chan, William and Jaitly, Navdeep and Le, Quoc and Vinyals, Oriol},
  title        = {Listen, attend and spell: A neural network for large vocabulary conversational speech recognition},
  booktitle    = {2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4960--4964},
  year         = {2016},
  organization = {IEEE}
}

% Dong, Xu and Xu, ICASSP 2018: Speech-Transformer.
@inproceedings{dong2018speech,
  author       = {Dong, Linhao and Xu, Shuang and Xu, Bo},
  title        = {Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5884--5888},
  year         = {2018},
  organization = {IEEE}
}

% Syllable-based Transformer ASR arXiv preprint. The language name is
% protected against sentence-casing.
@article{zhou2018syllable,
  author  = {Zhou, Shiyu and Dong, Linhao and Xu, Shuang and Xu, Bo},
  title   = {Syllable-based sequence-to-sequence speech recognition with the transformer in {Mandarin} {Chinese}},
  journal = {arXiv preprint arXiv:1804.10752},
  year    = {2018}
}

% Modeling-unit comparison paper, ICONIP 2018. The exported title had the
% language name lower-cased ("mandarin chinese"); it is restored and protected.
@inproceedings{zhou2018comparison,
  author       = {Zhou, Shiyu and Dong, Linhao and Xu, Shuang and Xu, Bo},
  title        = {A comparison of modeling units in sequence-to-sequence speech recognition with the transformer on {Mandarin} {Chinese}},
  booktitle    = {International Conference on Neural Information Processing},
  pages        = {210--220},
  year         = {2018},
  organization = {Springer}
}

% Vaswani et al., NIPS 2017: the Transformer paper.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {5998--6008},
  year      = {2017}
}

% Ba, Kiros and Hinton (2016), layer normalization arXiv preprint.
@article{ba2016layer,
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  title   = {Layer normalization},
  journal = {arXiv preprint arXiv:1607.06450},
  year    = {2016}
}

% Wu et al. (2016), GNMT arXiv preprint.
@article{wu2016google,
  author  = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and others},
  title   = {Google's neural machine translation system: Bridging the gap between human and machine translation},
  journal = {arXiv preprint arXiv:1609.08144},
  year    = {2016}
}

% Sennrich, Haddow and Birch (2015), subword units arXiv preprint.
@article{sennrich2015neural,
  author  = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  title   = {Neural machine translation of rare words with subword units},
  journal = {arXiv preprint arXiv:1508.07909},
  year    = {2015}
}

% Xception paper, CVPR 2017. Styles print booktitle verbatim, so the
% proceedings name is stored in proper title case.
@inproceedings{chollet2017xception,
  author    = {Chollet, Fran{\c{c}}ois},
  title     = {Xception: Deep learning with depthwise separable convolutions},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {1251--1258},
  year      = {2017}
}

% Inception-v3 paper, CVPR 2016. The architecture name is capitalised and
% protected, and the proceedings name is stored in proper title case.
@inproceedings{szegedy2016rethinking,
  author    = {Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew},
  title     = {Rethinking the {Inception} architecture for computer vision},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {2818--2826},
  year      = {2016}
}














% HMM word-spotting paper, ICASSP 1989. The year was only embedded in the
% booktitle ("ICASSP1989"), leaving the year field empty (BibTeX warning,
% broken author-year labels); it is now a proper field. The proper noun in the
% title is protected.
@inproceedings{rohlicek1989continuous,
  author    = {Rohlicek, J Robin and Russell, William and Roukos, Salim and Gish, Herbert},
  title     = {Continuous hidden {Markov} modeling for speaker-independent word spotting},
  booktitle = {ICASSP},
  year      = {1989}
}


% Small-footprint keyword spotting, ICASSP 2014. The percent sign is not a
% comment character inside a BibTeX entry, so the three "commented-out" lines
% were a syntax error; they are restored as real fields, which also supplies
% the missing year.
@inproceedings{chen2014small,
  author       = {Chen, Guoguo and Parada, Carolina and Heigold, Georg},
  title        = {Small-footprint keyword spotting using deep neural networks},
  booktitle    = {ICASSP},
  pages        = {4087--4091},
  year         = {2014},
  organization = {IEEE}
}

% Arik et al. (2017), CRNN keyword spotting arXiv preprint.
@article{arik2017convolutional,
  author  = {Arik, Sercan O and Kliegl, Markus and Child, Rewon and Hestness, Joel and Gibiansky, Andrew and Fougner, Chris and Prenger, Ryan and Coates, Adam},
  title   = {Convolutional recurrent neural networks for small-footprint keyword spotting},
  journal = {arXiv preprint arXiv:1703.05390},
  year    = {2017}
}

% Attention-based keyword spotting, Interspeech 2018. The percent sign is not
% a comment character inside a BibTeX entry, so the "commented-out" year line
% was a syntax error; it is restored as a real field.
@inproceedings{shan2018attention2,
  author    = {Shan, Changhao and Zhang, Junbo and Wang, Yujun and Xie, Lei},
  title     = {Attention-based End-to-End Models for Small-Footprint Keyword Spotting},
  booktitle = {Interspeech},
  year      = {2018}
}

% (duplicate removed) The key hinton2012deep is already defined earlier in
% this file for the same paper; a second definition makes BibTeX report
% "Repeated entry" and is ignored.

% Attention-based LVCSR paper, ICASSP 2016. The year was only embedded in the
% booktitle, leaving the year field empty; it is now a proper field.
@inproceedings{bahdanau2016end,
  author    = {Bahdanau, Dzmitry and Chorowski, Jan and Serdyuk, Dmitriy and Brakel, Philemon and Bengio, Yoshua},
  title     = {End-to-end attention-based large vocabulary speech recognition},
  booktitle = {ICASSP},
  year      = {2016}
}

% (duplicate removed) The key chan2016listen is already defined earlier in
% this file with fuller data; a second definition makes BibTeX report
% "Repeated entry".

% Hybrid MLP/HMM paper, ICASSP 1990. The year was only embedded in the
% booktitle, leaving the year field empty; it is now a proper field. The
% proper noun in the title is protected and the accent in the second author's
% given name is restored.
@inproceedings{morgan1990continuous,
  author    = {Morgan, Nelson and Bourlard, Herv{\'e}},
  title     = {Continuous speech recognition using multilayer perceptrons with hidden {Markov} models},
  booktitle = {ICASSP},
  year      = {1990}
}

% (duplicate removed) The key schmidhuber2015deep is already defined earlier
% in this file with the same data; a second definition makes BibTeX report
% "Repeated entry".

% Sainath and Parada, Interspeech 2015: CNNs for small-footprint keyword
% spotting. The key was sainath2015convolutional, which an earlier entry in
% this file already uses for a different paper (the ICASSP 2015 CLDNN paper),
% so this entry was unreachable. It now has its own key; citations meant for
% the keyword-spotting paper must use the new key.
@inproceedings{sainath2015convolutionalkws,
  author    = {Sainath, Tara N and Parada, Carolina},
  title     = {Convolutional neural networks for small-footprint keyword spotting},
  booktitle = {Interspeech},
  year      = {2015}
}

% (duplicate removed) The key hochreiter1997long is already defined earlier in
% this file with the same data; a second definition makes BibTeX report
% "Repeated entry".

% TDNN paper, Interspeech 2015. The year was only embedded in the booktitle,
% leaving the year field empty; it is now a proper field.
@inproceedings{peddinti2015time,
  author    = {Peddinti, Vijayaditya and Povey, Daniel and Khudanpur, Sanjeev},
  title     = {A time delay neural network architecture for efficient modeling of long temporal contexts},
  booktitle = {Interspeech},
  year      = {2015}
}

% Microsoft 2016 conversational ASR system, ICASSP 2017. The year is now a
% proper field instead of being embedded in the booktitle, and the company
% name is protected against sentence-casing.
@inproceedings{xiong2017microsoft,
  author    = {Xiong, Wayne and Droppo, Jasha and Huang, Xuedong and Seide, Frank and Seltzer, Mike and Stolcke, Andreas and Yu, Dong and Zweig, Geoffrey},
  title     = {The {Microsoft} 2016 conversational speech recognition system},
  booktitle = {ICASSP},
  year      = {2017}
}

% Microsoft deep-learning overview, ICASSP 2013. The year is now a proper
% field, the company name is protected, and "JuiTing" is spelled "Jui-Ting"
% as in the other entry by the same author in this file.
@inproceedings{deng2013recent,
  author    = {Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, Frank and Seltzer, Michael and Zweig, Geoff and He, Xiaodong and Williams, Jason and others},
  title     = {Recent advances in deep learning for speech research at {Microsoft}},
  booktitle = {ICASSP},
  year      = {2013}
}

% Noise in recurrent networks paper. The journal name is stored in title case
% because styles print it verbatim.
% NOTE(review): volume, number and pages are missing -- add them from the
% published article.
@article{jim1996analysis,
  author  = {Jim, Kam-Chuen and Giles, C Lee and Horne, Bill G},
  title   = {An analysis of noise in recurrent neural networks: convergence and generalization},
  journal = {IEEE Transactions on Neural Networks},
  year    = {1996}
}

% (duplicate removed) The key graves2006connectionist is already defined
% earlier in this file with fuller data; a second definition makes BibTeX
% report "Repeated entry".

% Deep Speech 2 paper, ICML 2016. The year is now a proper field instead of
% being embedded in the booktitle; the system name and language names are
% protected against sentence-casing.
@inproceedings{amodei2016deep,
  author    = {Amodei, Dario and Ananthanarayanan, Sundaram and Anubhai, Rishita and Bai, Jingliang and Battenberg, Eric and Case, Carl and Casper, Jared and Catanzaro, Bryan and Cheng, Qiang and Chen, Guoliang and others},
  title     = {{Deep Speech 2}: End-to-end speech recognition in {English} and {Mandarin}},
  booktitle = {ICML},
  year      = {2016}
}

% EESEN paper, ASRU 2015. The year is now a proper field instead of being
% embedded in the booktitle; acronyms are protected against sentence-casing.
@inproceedings{miao2015eesen,
  author    = {Miao, Yajie and Gowayyed, Mohammad and Metze, Florian},
  title     = {{EESEN}: End-to-end speech recognition using deep {RNN} models and {WFST}-based decoding},
  booktitle = {ASRU},
  year      = {2015}
}

% (duplicate removed) The key graves2012sequence is already defined earlier in
% this file with the same data; a second definition makes BibTeX report
% "Repeated entry".

% Attention-based ASR on voice search. The title was stored in all capitals,
% which styles cannot undo; it is now in sentence case. The year is now a
% proper field instead of being embedded in the booktitle.
% NOTE(review): the year follows the original booktitle "ICASSP2017" --
% confirm the conference edition.
@inproceedings{shan2017attention,
  author    = {Shan, Changhao and Zhang, Junbo and Wang, Yujun and Xie, Lei},
  title     = {Attention-based end-to-end speech recognition on voice search},
  booktitle = {ICASSP},
  year      = {2017}
}

% Code-switching end-to-end ASR, ICASSP 2019 (the key keeps its original
% spelling so existing citations still resolve). The year is now a proper
% field and the language pair is protected against sentence-casing.
@inproceedings{shan2017Investigating,
  author    = {Shan, Changhao and Weng, Chao and Wang, Guangsen and Su, Dan and Yu, Dong and Xie, Lei},
  title     = {Investigating End-to-End Speech Recognition for {Mandarin-English} Code-Switching},
  booktitle = {ICASSP},
  year      = {2019}
}

% Component Fusion paper, ICASSP 2019 (the key keeps its original spelling so
% existing citations still resolve). The year is now a proper field instead of
% being embedded in the booktitle.
@inproceedings{shan2017Component,
  author    = {Shan, Changhao and Weng, Chao and Wang, Guangsen and Su, Dan and Yu, Dong and Xie, Lei},
  title     = {Component Fusion: Learning Replaceable Language Model Component For End-to-End Speech Recognition system},
  booktitle = {ICASSP},
  year      = {2019}
}

% Attention-based recurrent NN ASR, arXiv preprint. The acronym is protected
% against sentence-casing.
@article{chorowski2014end,
  author  = {Chorowski, Jan and Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {End-to-end continuous speech recognition using attention-based recurrent {NN}: first results},
  journal = {arXiv preprint arXiv:1412.1602},
  year    = {2014}
}

% Chorowski et al., NIPS 2015: attention-based models for speech recognition.
@inproceedings{chorowski2015attention,
  author    = {Chorowski, Jan K and Bahdanau, Dzmitry and Serdyuk, Dmitriy and Cho, Kyunghyun and Bengio, Yoshua},
  title     = {Attention-based models for speech recognition},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2015}
}

% (duplicate removed) The key vaswani2017attention is already defined earlier
% in this file with the same data; a second definition makes BibTeX report
% "Repeated entry".

% Keyword spotting comparison, Interspeech 2005. The booktitle had a trailing
% space and carried the year; the year is now a proper field.
% NOTE(review): several author names appear to have lost diacritics in
% export -- confirm spellings.
@inproceedings{Sz2005Comparison,
  author    = {Szoke, Igor and Schwarz, Petr and Matejka, Pavel and Burget, Luk{\'a}s and Karafi{\'a}t, Martin and Fapso, Michal and Cernocky, Jan},
  title     = {Comparison of keyword spotting approaches for informal continuous speech},
  booktitle = {Interspeech},
  year      = {2005}
}

% Max-pooling loss LSTM keyword spotting, SLT 2016. The year was only embedded
% in the booktitle, leaving the year field empty; it is now a proper field.
@inproceedings{sun2016max,
  author    = {Sun, Ming and Raju, Anirudh and Tucker, George and Panchapagesan, Sankaran and Fu, Gengshen and Mandal, Arindam and Matsoukas, Spyros and Strom, Nikko and Vitaladevuni, Shiv},
  title     = {Max-pooling loss training of long short-term memory networks for small-footprint keyword spotting},
  booktitle = {SLT},
  year      = {2016}
}

% CTC keyword spotting for Mandarin, ISCSLP 2016. The year is now a proper
% field instead of being embedded in the booktitle, and the language name is
% protected against sentence-casing.
@inproceedings{bai2016end,
  author    = {Bai, Ye and Yi, Jiangyan and Ni, Hao and Wen, Zhengqi and Liu, Bin and Li, Ya and Tao, Jianhua},
  title     = {End-to-end keywords spotting based on connectionist temporal classification for {Mandarin}},
  booktitle = {ISCSLP},
  year      = {2016}
}

% Low-resource end-to-end ASR and keyword search, ICASSP 2017. The year was
% only embedded in the booktitle, leaving the year field empty; it is now a
% proper field.
@inproceedings{Rosenberg2017End,
  author    = {Rosenberg, Andrew and Audhkhasi, Kartik and Sethy, Abhinav and Ramabhadran, Bhuvana and Picheny, Michael},
  title     = {End-to-end speech recognition and keyword search on low-resource languages},
  booktitle = {ICASSP},
  year      = {2017}
}

% Hwang, Lee and Sung (2015), character-level RNN keyword spotting preprint.
@article{hwang2015online,
  author  = {Hwang, Kyuyeon and Lee, Minjae and Sung, Wonyong},
  title   = {Online keyword spotting with a character-level recurrent neural network},
  journal = {arXiv preprint arXiv:1512.08903},
  year    = {2015}
}

% He et al. (2017), streaming sequence-to-sequence keyword spotting preprint.
@article{he2017streaming,
  author  = {He, Yanzhang and Prabhavalkar, Rohit and Rao, Kanishka and Li, Wei and Bakhtin, Anton and McGraw, Ian},
  title   = {Streaming Small-Footprint Keyword Spotting using Sequence-to-Sequence Models},
  journal = {arXiv preprint arXiv:1710.09617},
  year    = {2017}
}

@article{cho2014learning,
  author  = {Cho, Kyunghyun and van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  title   = {Learning phrase representations using {RNN} encoder-decoder for statistical machine translation},
  journal = {arXiv preprint arXiv:1406.1078},
  year    = {2014},
}

@article{bahdanau2014neural,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}

@article{rush2015neural,
  author  = {Rush, Alexander M and Chopra, Sumit and Weston, Jason},
  title   = {A neural attention model for abstractive sentence summarization},
  journal = {arXiv preprint arXiv:1509.00685},
  year    = {2015},
}

@article{chowdhury2017attention,
  author  = {Chowdhury, F. A. and Wang, Quan and Lopez Moreno, Ignacio and Wan, Li},
  title   = {Attention-based models for text-dependent speaker verification},
  journal = {arXiv preprint arXiv:1710.10470},
  year    = {2017},
}

@inproceedings{wang2017trainable,
  author    = {Wang, Yuxuan and Getreuer, Pascal and Hughes, Thad and Lyon, Richard F and Saurous, Rif A},
  title     = {Trainable frontend for robust and far-field keyword spotting},
  booktitle = {ICASSP},
  year      = {2017},
}

@inproceedings{motlicek2012improving,
  author    = {Motlicek, Petr and Valente, Fabio and Szoke, Igor},
  title     = {Improving acoustic based keyword spotting using {LVCSR} lattices},
  booktitle = {ICASSP},
  year      = {2012},
}

@article{can2011lattice,
  author    = {Can, Do{\u{g}}an and Saraclar, Murat},
  title     = {Lattice indexing for spoken term detection},
  journal   = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume    = {19},
  number    = {8},
  pages     = {2338--2347},
  year      = {2011},
  publisher = {IEEE},
}

@inproceedings{bu2017aishell,
  author    = {Bu, Hui and Du, Jiayu and Na, Xingyu and Wu, Bengu and Zheng, Hao},
  title     = {{AISHELL-1}: An open-source {Mandarin} speech corpus and a speech recognition baseline},
  booktitle = {O-COCOSDA},
  year      = {2017},
}

@article{kingma2014adam,
  author  = {Kingma, Diederik P and Ba, Jimmy},
  title   = {Adam: A method for stochastic optimization},
  journal = {arXiv preprint arXiv:1412.6980},
  year    = {2014},
}

@inproceedings{chan2016online,
  author    = {Chan, William and Lane, Ian},
  title     = {On Online Attention-Based Speech Recognition and Joint {Mandarin} Character-{Pinyin} Training},
  booktitle = {Interspeech},
  year      = {2016},
}

@article{hori2017advances,
  author  = {Hori, Takaaki and Watanabe, Shinji and Zhang, Yu and Chan, William},
  title   = {Advances in joint {CTC}-attention based end-to-end speech recognition with a deep {CNN} encoder and {RNN-LM}},
  journal = {arXiv preprint arXiv:1706.02737},
  year    = {2017},
}

% Key zhang2017very is defined once, further down in this file, by the complete
% entry (with pages, year and organization). BibTeX rejects repeated keys, so
% the shorter copy that used to sit here is not kept.

% Key kim2017joint is defined once, further down in this file, by the complete
% entry (with pages, year and organization). BibTeX rejects repeated keys, so
% the shorter copy that used to sit here is not kept.

@article{ochiai2017multichannel,
  author  = {Ochiai, Tsubasa and Watanabe, Shinji and Hori, Takaaki and Hershey, John R},
  title   = {Multichannel End-to-end Speech Recognition},
  journal = {arXiv preprint arXiv:1703.04783},
  year    = {2017},
}

@article{chorowski2016towards,
  author  = {Chorowski, Jan and Jaitly, Navdeep},
  title   = {Towards better decoding and language model integration in sequence to sequence models},
  journal = {arXiv preprint arXiv:1612.02695},
  year    = {2016},
}

@inproceedings{Glorot2010Understanding,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics ({AISTATS})},
  series    = {{JMLR} Workshop and Conference Proceedings},
  volume    = {9},
  pages     = {249--256},
  year      = {2010},
}

@inproceedings{xu2015show,
  author    = {Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhutdinov, Ruslan and Zemel, Rich and Bengio, Yoshua},
  title     = {Show, attend and tell: Neural image caption generation with visual attention},
  booktitle = {ICML},
  year      = {2015},
}

@article{wu2016google,
  author  = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and others},
  title   = {Google's neural machine translation system: Bridging the gap between human and machine translation},
  journal = {arXiv preprint arXiv:1609.08144},
  year    = {2016},
}

@inproceedings{hinton1993keeping,
  author    = {Hinton, Geoffrey E and Van Camp, Drew},
  title     = {Keeping the neural networks simple by minimizing the description length of the weights},
  booktitle = {Proceedings of the Sixth Annual Conference on Computational Learning Theory ({COLT})},
  pages     = {5--13},
  year      = {1993},
}

@article{sak2015fast,
  author  = {Sak, Ha{\c{s}}im and Senior, Andrew and Rao, Kanishka and Beaufays, Fran{\c{c}}oise},
  title   = {Fast and accurate recurrent neural network acoustic models for speech recognition},
  journal = {arXiv preprint arXiv:1507.06947},
  year    = {2015},
}

@inproceedings{miao2016simplifying,
  author    = {Miao, Yajie and Li, Jinyu and Wang, Yongqiang and Zhang, Shi-Xiong and Gong, Yifan},
  title     = {Simplifying long short-term memory acoustic models for fast training and decoding},
  booktitle = {ICASSP},
  year      = {2016},
}

% Same paper as the entry keyed prabhavalkar2017comparison elsewhere in this
% file; this key is retained in case documents cite it.
@inproceedings{Prabhavalkar2017,
  author    = {Prabhavalkar, Rohit and Rao, Kanishka and Sainath, Tara N and Li, Bo and Johnson, Leif and Jaitly, Navdeep},
  title     = {A Comparison of Sequence-to-sequence Models for Speech Recognition},
  booktitle = {Interspeech},
  pages     = {939--943},
  year      = {2017},
}

@inproceedings{Vu2012A,
  author    = {Vu, Ngoc Thang and Lyu, Dau-Cheng and Weiner, Jochen and Telaar, Dominic and Schlippe, Tim and Blaicher, Fabian and Chng, Eng Siong and Schultz, Tanja and Li, Haizhou},
  title     = {A first speech recognition system for {Mandarin-English} code-switch conversational speech},
  booktitle = {ICASSP},
  year      = {2012},
}

% unit merging, Mandarin-English
@inproceedings{yeh2014transcribing,
  author    = {Yeh, Ching-Feng and Lee, Lin-Shan},
  title     = {Transcribing code-switched bilingual lectures using deep neural networks with unit merging in acoustic modeling},
  booktitle = {ICASSP},
  year      = {2014},
}

% Acoustic model, decision tree, merge, SEAME
@inproceedings{Li2011Asymmetric,
  author    = {Li, Ying and Fung, Pascale and Xu, Ping and Liu, Yi},
  title     = {Asymmetric acoustic modeling of mixed language speech},
  booktitle = {ICASSP},
  year      = {2011},
}

% Mandarin-English, adaptation, unit merging, 
@inproceedings{yeh2011bilingual,
  author    = {Yeh, Ching-Feng and Huang, Chao-Yu and Lee, Lin-Shan},
  title     = {Bilingual acoustic model adaptation by unit merging on different levels and cross-level integration},
  booktitle = {Interspeech},
  year      = {2011},
}

% unit merging, Mandarin-English
@inproceedings{yeh2011bilingual2,
  author    = {Yeh, Ching-Feng and Sun, Liang-Che and Huang, Chao-Yu and Lee, Lin-Shan},
  title     = {Bilingual acoustic modeling with state mapping and three-stage adaptation for transcribing unbalanced code-mixed lectures},
  booktitle = {ICASSP},
  year      = {2011},
}

% Mandarin-English, Language model, POS, predict code-switching, SEAME
@inproceedings{Adel2013Recurrent,
  author    = {Adel, Heike and Vu, Ngoc Thang and Kraus, Franziska and Schlippe, Tim and Li, Haizhou and Schultz, Tanja},
  title     = {Recurrent neural network language modeling for code switching conversational speech},
  booktitle = {ICASSP},
  year      = {2013},
}

% lexicon, new transcribers, SEAME
@inproceedings{guo2018study,
  author    = {Guo, Pengcheng and Xu, Haihua and Xie, Lei and Chng, Eng Siong},
  title     = {Study of Semi-supervised Approaches to Improving {English-Mandarin} Code-Switching Speech Recognition},
  booktitle = {Interspeech},
  year      = {2018},
}

% Spanish-English
@inproceedings{guzman2017metrics,
  author    = {Guzm{\'a}n, Gualberto and Ricard, Joseph and Serigos, Jacqueline and Bullock, Barbara E and Toribio, Almeida Jacqueline},
  title     = {Metrics for modeling code-switching across corpora},
  booktitle = {Interspeech},
  year      = {2017},
}

% Mandarin-English, SEAME
@inproceedings{lyu2010analysis,
  author    = {Lyu, Dau-Cheng and Tan, Tien-Ping and Chng, Eng-Siong and Li, Haizhou},
  title     = {An analysis of a {Mandarin-English} code-switching speech corpus: {SEAME}},
  booktitle = {Interspeech},
  year      = {2010},
}

% Cantonese-English, corpus
@inproceedings{chan2005development,
  author    = {Chan, Joyce Y. C. and Ching, P. C. and Lee, Tan},
  title     = {Development of a {Cantonese-English} code-mixing speech corpus},
  booktitle = {Eurospeech},
  year      = {2005},
}

% French-Algerian, corpora
@inproceedings{amazouz2017addressing,
  author    = {Amazouz, Djegdjiga and Adda-Decker, Martine and Lamel, Lori},
  title     = {Addressing Code-Switching in {French/Algerian Arabic} Speech},
  booktitle = {Interspeech},
  year      = {2017},
}

% isiZulu-English, Language model, word embeddings.
@inproceedings{van2017synthesising,
  author    = {van der Westhuizen, Ewald and Niesler, Thomas},
  title     = {Synthesising {isiZulu-English} code-switch bigrams using word embeddings},
  booktitle = {Interspeech},
  year      = {2017},
}

@inproceedings{adel2013combination,
  author    = {Adel, Heike and Vu, Ngoc Thang and Schultz, Tanja},
  title     = {Combination of recurrent neural networks and factored language models for code-switching language modeling},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics ({ACL})},
  year      = {2013},
}

@article{pan2010survey,
  author    = {Pan, Sinno Jialin and Yang, Qiang},
  title     = {A survey on transfer learning},
  journal   = {IEEE Transactions on Knowledge and Data Engineering},
  volume    = {22},
  number    = {10},
  pages     = {1345--1359},
  year      = {2010},
  publisher = {IEEE},
}

% ten languages, attention-based end-to-end
@inproceedings{seki2018end,
  author    = {Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and Le Roux, Jonathan and Hershey, John R},
  title     = {An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech},
  booktitle = {ICASSP},
  year      = {2018},
}

@inproceedings{watanabe2017language,
  author    = {Watanabe, Shinji and Hori, Takaaki and Hershey, John R},
  title     = {Language independent end-to-end architecture for joint language identification and speech recognition},
  booktitle = {ASRU},
  year      = {2017},
}

@article{Y2016Investigating,
  author  = {Y{\i}lmaz, Emre and van den Heuvel, Henk and van Leeuwen, David},
  title   = {Investigating Bilingual Deep Neural Networks for Automatic Recognition of Code-switching {Frisian} Speech},
  journal = {Procedia Computer Science},
  year    = {2016},
}

@article{srivastava2014dropout,
  author  = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
  title   = {Dropout: a simple way to prevent neural networks from overfitting},
  journal = {Journal of Machine Learning Research},
  volume  = {15},
  number  = {1},
  pages   = {1929--1958},
  year    = {2014},
}

@inproceedings{weng2018improving,
  author    = {Weng, Chao and Cui, Jia and Wang, Guangsen and Wang, Jun and Yu, Chengzhu and Su, Dan and Yu, Dong},
  title     = {Improving Attention Based Sequence-to-Sequence Models for End-to-End {English} Conversational Speech Recognition},
  booktitle = {Interspeech},
  pages     = {761--765},
  year      = {2018},
}

@inproceedings{karita2018semi,
  author    = {Karita, Shigeki and Watanabe, Shinji and Iwata, Tomoharu and Ogawa, Atsunori and Delcroix, Marc},
  title     = {Semi-Supervised End-to-End Speech Recognition},
  booktitle = {Interspeech},
  year      = {2018},
}

@inproceedings{sriram2017cold,
  author    = {Sriram, Anuroop and Jun, Heewoo and Satheesh, Sanjeev and Coates, Adam},
  title     = {Cold fusion: Training seq2seq models together with language models},
  booktitle = {Interspeech},
  year      = {2018},
}

@article{toshniwal2018comparison,
  author  = {Toshniwal, Shubham and Kannan, Anjuli and Chiu, Chung-Cheng and Wu, Yonghui and Sainath, Tara N and Livescu, Karen},
  title   = {A comparison of techniques for language model integration in encoder-decoder speech recognition},
  journal = {arXiv preprint arXiv:1807.10857},
  year    = {2018},
}

@inproceedings{mikolov2010recurrent,
  author    = {Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
  title     = {Recurrent neural network based language model},
  booktitle = {Interspeech},
  year      = {2010},
}

@inproceedings{hori2017multi,
  author    = {Hori, Takaaki and Watanabe, Shinji and Hershey, John R},
  title     = {Multi-level language modeling and decoding for open vocabulary end-to-end speech recognition},
  booktitle = {ASRU},
  year      = {2017},
}

@inproceedings{toshniwal2017multilingual,
  author    = {Toshniwal, Shubham and Sainath, Tara N. and Weiss, Ron J. and Li, Bo and Moreno, Pedro and Weinstein, Eugene and Rao, Kanishka},
  title     = {Multilingual speech recognition with a single end-to-end model},
  booktitle = {ICASSP},
  year      = {2018},
}

@inproceedings{chiu2018state,
  author    = {Chiu, Chung-Cheng and Sainath, Tara N and Wu, Yonghui and Prabhavalkar, Rohit and Nguyen, Patrick and Chen, Zhifeng and Kannan, Anjuli and Weiss, Ron J and Rao, Kanishka and Gonina, Ekaterina and others},
  title     = {State-of-the-art speech recognition with sequence-to-sequence models},
  booktitle = {ICASSP},
  year      = {2018},
}

@inproceedings{luong2015effective,
  author    = {Luong, Thang and Pham, Hieu and Manning, Christopher D},
  title     = {Effective Approaches to Attention-based Neural Machine Translation},
  booktitle = {EMNLP},
  year      = {2015},
}

@article{dahl2012context,
  author  = {Dahl, George E and Yu, Dong and Deng, Li and Acero, Alex},
  title   = {Context-dependent pre-trained deep neural networks for large-vocabulary speech recognition},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume  = {20},
  number  = {1},
  pages   = {30--42},
  year    = {2012},
}

@article{sun2017unsupervised,
  author    = {Sun, Sining and Zhang, Binbin and Xie, Lei and Zhang, Yanning},
  title     = {An unsupervised deep domain adaptation approach for robust speech recognition},
  journal   = {Neurocomputing},
  volume    = {257},
  pages     = {79--87},
  year      = {2017},
  publisher = {Elsevier},
}

@article{deng2014deep,
  author    = {Deng, Li and Yu, Dong and others},
  title     = {Deep learning: methods and applications},
  journal   = {Foundations and Trends{\textregistered} in Signal Processing},
  volume    = {7},
  number    = {3--4},
  pages     = {197--387},
  year      = {2014},
  publisher = {Now Publishers, Inc.},
}

@article{gulcehre2015using,
  author  = {Gulcehre, Caglar and Firat, Orhan and Xu, Kelvin and Cho, Kyunghyun and Barrault, Loic and Lin, Huei-Chi and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  title   = {On using monolingual corpora in neural machine translation},
  journal = {arXiv preprint arXiv:1503.03535},
  year    = {2015},
}

% NOTE(review): the "journal" value below is a university name, so this looks
% like a thesis. Confirm the degree type before switching the entry type to
% @mastersthesis or @phdthesis (with the university in "school").
@article{hechangqing2017,
  author  = {何长青},
  title   = {针对客服对话的语音识别和关键词检出},
  journal = {西北工业大学},
  year    = {2017},
}

% NOTE(review): the "journal" value below is a university name, so this looks
% like a thesis. Confirm the degree type before switching the entry type to
% @mastersthesis or @phdthesis (with the university in "school").
@article{zhangbinbin2017,
  author  = {张彬彬},
  title   = {基于CD-Phone和CTC的语音识别技术研究},
  journal = {西北工业大学},
  year    = {2017},
}




@inproceedings{sainath2018improving,
  author       = {Sainath, Tara N and Chiu, Chung-Cheng and Prabhavalkar, Rohit and Kannan, Anjuli and Wu, Yonghui and Nguyen, Patrick and Chen, Zhifeng},
  title        = {Improving the performance of online neural transducer models},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5864--5868},
  year         = {2018},
  organization = {IEEE},
}

@article{chiu2017monotonic,
  author  = {Chiu, Chung-Cheng and Raffel, Colin},
  title   = {Monotonic chunkwise attention},
  journal = {arXiv preprint arXiv:1712.05382},
  year    = {2017},
}

@inproceedings{moritz2019triggered,
  author       = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
  title        = {Triggered Attention for End-to-end Speech Recognition},
  booktitle    = {2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5666--5670},
  year         = {2019},
  organization = {IEEE},
}

@article{muda2010voice,
  author  = {Muda, Lindasalwa and Begam, Mumtaj and Elamvazuthi, Irraivan},
  title   = {Voice recognition algorithms using {Mel} frequency cepstral coefficient ({MFCC}) and dynamic time warping ({DTW}) techniques},
  journal = {arXiv preprint arXiv:1003.4083},
  year    = {2010},
}

@article{forney1973viterbi,
  author    = {Forney, G David},
  title     = {The {Viterbi} algorithm},
  journal   = {Proceedings of the IEEE},
  volume    = {61},
  number    = {3},
  pages     = {268--278},
  year      = {1973},
  publisher = {IEEE},
}

@inproceedings{chorowski2015attention,
  author    = {Chorowski, Jan K and Bahdanau, Dzmitry and Serdyuk, Dmitriy and Cho, Kyunghyun and Bengio, Yoshua},
  title     = {Attention-based models for speech recognition},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {577--585},
  year      = {2015},
}

@inproceedings{zhang2017very,
  author       = {Zhang, Yu and Chan, William and Jaitly, Navdeep},
  title        = {Very deep convolutional networks for end-to-end speech recognition},
  booktitle    = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4845--4849},
  year         = {2017},
  organization = {IEEE},
}

@incollection{lecun1995convolutional,
  author    = {LeCun, Yann and Bengio, Yoshua},
  title     = {Convolutional networks for images, speech, and time series},
  booktitle = {The Handbook of Brain Theory and Neural Networks},
  editor    = {Arbib, Michael A.},
  publisher = {MIT Press},
  year      = {1995},
}

@inproceedings{szegedy2016rethinking,
  author    = {Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew},
  title     = {Rethinking the {Inception} architecture for computer vision},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {2818--2826},
  year      = {2016},
}

@inproceedings{bengio2015scheduled,
  author    = {Bengio, Samy and Vinyals, Oriol and Jaitly, Navdeep and Shazeer, Noam},
  title     = {Scheduled sampling for sequence prediction with recurrent neural networks},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1171--1179},
  year      = {2015},
}

@article{park2019specaugment,
  author  = {Park, Daniel S and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D and Le, Quoc V},
  title   = {{SpecAugment}: A simple data augmentation method for automatic speech recognition},
  journal = {arXiv preprint arXiv:1904.08779},
  year    = {2019},
}

@inproceedings{shan2019component,
  author       = {Shan, Changhao and Weng, Chao and Wang, Guangsen and Su, Dan and Luo, Min and Yu, Dong and Xie, Lei},
  title        = {Component Fusion: Learning Replaceable Language Model Component for End-to-end Speech Recognition System},
  booktitle    = {2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5631--5635},
  year         = {2019},
  organization = {IEEE},
}

@article{watanabe2017hybrid,
  author    = {Watanabe, Shinji and Hori, Takaaki and Kim, Suyoun and Hershey, John R and Hayashi, Tomoki},
  title     = {Hybrid {CTC}/attention architecture for end-to-end speech recognition},
  journal   = {IEEE Journal of Selected Topics in Signal Processing},
  volume    = {11},
  number    = {8},
  pages     = {1240--1253},
  year      = {2017},
  publisher = {IEEE},
}

@inproceedings{kim2017joint,
  author       = {Kim, Suyoun and Hori, Takaaki and Watanabe, Shinji},
  title        = {Joint {CTC}-attention based end-to-end speech recognition using multi-task learning},
  booktitle    = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4835--4839},
  year         = {2017},
  organization = {IEEE},
}

@inproceedings{prabhavalkar2017comparison,
  author    = {Prabhavalkar, Rohit and Rao, Kanishka and Sainath, Tara N and Li, Bo and Johnson, Leif and Jaitly, Navdeep},
  title     = {A Comparison of Sequence-to-Sequence Models for Speech Recognition},
  booktitle = {Interspeech},
  pages     = {939--943},
  year      = {2017},
}

% Key chiu2017monotonic is defined once, earlier in this file, with identical
% fields. BibTeX rejects repeated keys, so no second copy is kept here.

@article{bengio2003neural,
  author  = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
  title   = {A neural probabilistic language model},
  journal = {Journal of Machine Learning Research},
  volume  = {3},
  number  = {Feb},
  pages   = {1137--1155},
  year    = {2003},
}

% Key mikolov2010recurrent is defined once, earlier in this file. BibTeX rejects
% repeated keys, so no second copy is kept here.

@article{gauvain1994maximum,
  author    = {Gauvain, Jean-Luc and Lee, Chin-Hui},
  title     = {Maximum a posteriori estimation for multivariate {Gaussian} mixture observations of {Markov} chains},
  journal   = {IEEE Transactions on Speech and Audio Processing},
  volume    = {2},
  number    = {2},
  pages     = {291--298},
  year      = {1994},
  publisher = {IEEE},
}

@book{young1993htk,
  author    = {Young, Steve J},
  title     = {The {HTK} hidden {Markov} model toolkit: Design and philosophy},
  year      = {1993},
  publisher = {University of Cambridge, Department of Engineering Cambridge, England},
}

